Here we show the code to reproduce the analyses of: Risso and Pagnotta (2020). Per-sample standardization and asymmetric winsorization lead to accurate classification of RNA-seq expression profiles. In preparation.
This file belongs to the repository: https://github.com/drisso/awst_analysis.
The code is released with license GPL v3.0.
awstif (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("drisso/awst")
… We then created \(k=5\) groups of samples, each made of \(M=30\) replicate samples, randomly selected (with replacement) from the original set of 80. For each group, we randomly selected (without replacement) \(J=500\) genes, whose expression was altered according to the following multiplicative model. \[ \tilde{y}_j = y_j \cdot (0.001 + r_j), \quad j=1,\ldots,J, \] where \(y_j\) denotes the observed expression of gene \(j\), \(\tilde{y}\) denotes the perturbed expression, and \(r_j\) is the realization of a Gamma random variable with shape parameter \(a=0.5\) and scale parameter \(s=1\).
#BiocManager::install("seqc") # uncomment if necessary
rm(list = ls())
set.seed(20200413)
library(seqc)
ddata <- get("ILM_aceview_gene_MAY")
ddata <- ddata[!is.na(ddata$EntrezID),]
ddata <- ddata[!ddata$IsERCC,]
feature_annotation <- data.frame(ddata[, 1:3], row.names = ddata$EntrezID)
dim(ddata <- as.matrix(ddata[, -c(1:4)])) #[1] 19701 384
sum(duplicated(feature_annotation$Symbol)) #[1] 0
sum(duplicated(feature_annotation$EntrezID)) #[1] 0
row.names(ddata) <- feature_annotation$EntrezID
dim(sample_annotation.df <- data.frame(row.names = colnames(ddata), sample = colnames(ddata))) #[1] 384 1
sample_annotation.df$lane <- substr(sample_annotation.df$sample, 6, 7)
sample_annotation.df$flow_cell <- substr(sample_annotation.df$sample, 17, 17)
sample_annotation.df$sample <- substr(sample_annotation.df$sample, 1, 3)
sample_annotation.df <- sample_annotation.df[grep("A", sample_annotation.df$sample),]
dim(ddata <- ddata[, rownames(sample_annotation.df)]) #[1] 19701 80
M <- 30
nnumbers <- c("01", "02", "03", "04", "05", "06", "07", "08", "09", paste(10:M))
tmp <- c(paste0("A", nnumbers), paste0("B", nnumbers), paste0("C", nnumbers),
paste0("D", nnumbers), paste0("E", nnumbers))
design.df <- data.frame(row.names = tmp, sample = tmp, original.sample = NA)
design.df$original.sample[1:M] <- sample(rownames(sample_annotation.df), M, replace = TRUE)
design.df$original.sample[(M+1):(2*M)] <- sample(rownames(sample_annotation.df), M, replace = TRUE)
design.df$original.sample[(2*M+1):(3*M)] <- sample(rownames(sample_annotation.df), M, replace = TRUE)
design.df$original.sample[(3*M+1):(4*M)] <- sample(rownames(sample_annotation.df), M, replace = TRUE)
design.df$original.sample[(4*M+1):(5*M)] <- sample(rownames(sample_annotation.df), M, replace = TRUE)
# table(design.df$original.sample)
k <- "A"
wwhich <- grep(k, design.df$sample)
synthetic_data <- ddata[, design.df$original.sample[wwhich]]
colnames(synthetic_data) <- design.df$sample[wwhich]
the_genes <- rownames(synthetic_data)
no_of_altered_genes <- 500
genes_to_alterate <- sample(the_genes, no_of_altered_genes, replace = FALSE)
the_genes <- the_genes[-which(the_genes %in% genes_to_alterate)]
for(jj in 1:M) {
tmp <- 0.001 + rgamma(length(genes_to_alterate), shape = 0.5, scale = 1)
synthetic_data[genes_to_alterate, jj] <- synthetic_data[genes_to_alterate, jj] * tmp
}
# k <- "B"
for(k in c("B", "C", "D", "E")) {
wwhich <- grep(k, design.df$sample)
tmp_data <- ddata[, design.df$original.sample[wwhich]]
colnames(tmp_data) <- design.df$sample[wwhich]
genes_to_alterate <- sample(the_genes, no_of_altered_genes, replace = FALSE)
the_genes <- the_genes[-which(the_genes %in% genes_to_alterate)]
for(jj in 1:M) {
tmp <- 0.001 + rgamma(length(genes_to_alterate), shape = 0.5, scale = 1)
tmp_data[genes_to_alterate, jj] <- tmp_data[genes_to_alterate, jj] * tmp
}
synthetic_data <- cbind(synthetic_data, tmp_data)
}
dim(ddata <- floor(synthetic_data)) #[1] 19701 150
annotation.df <- data.frame(samples = colnames(ddata), row.names = colnames(ddata))
annotation.df$sample <- substr(annotation.df$samples, 1, 1)
annotation.df$sample.col <- factor(annotation.df$sample)
levels(annotation.df$sample.col) <- clust.col <- c("gold", "red", "green2", "blue", "cyan")
names(clust.col) <- unique(annotation.df$sample)
save(ddata, annotation.df, feature_annotation, clust.col, file = "synthetic20200413.RData")
#tmp <- cbind(annotation.df, t(ddata))
#write.table(tmp, file = "synthetic20200413.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
#write.table(feature_annotation, file = "synthetic20200413_features_annotation.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## adjusted-Rand index (clues): 0.9832
Result of ConsensuClusterPlus with default setup
(innerLinkage=“average”, finalLinkage=“average”, distance=“pearson”)
## adjusted-Rand index (clues): 1
|
|
|
|
|
|
Result of ConsensuClusterPlus with clusterAlg = “pam” and distance = “euclidean” (other parameters left to default)
## adjusted-Rand index (clues): 1
|
|
|
|
|
|
Finding the active genes in deep RNA-seq gene expression studies
## adjusted-Rand index (clues): 0.6779
Result of ConsensuClusterPlus with default setup
(innerLinkage=“average”, finalLinkage=“average”, distance=“pearson”)
## adjusted-Rand index (clues): 0.6952
|
|
|
|
|
|
Result of ConsensuClusterPlus with clusterAlg = “pam” and distance = “euclidean” (other parameters left to default)
## adjusted-Rand index (clues): 1
|
|
|
|
|
|
The Integrated Genomic Landscape of Thymic Epithelial Tumors
## adjusted-Rand index (clues): 0.0848
Result of ConsensuClusterPlus with default setup
(innerLinkage=“average”, finalLinkage=“average”, distance=“pearson”)
## adjusted-Rand index (clues): 1
|
|
|
|
|
|
Result of ConsensuClusterPlus with clusterAlg = “pam” and distance = “euclidean” (other parameters left to default)
## adjusted-Rand index (clues): 0.1212
|
|
|
|
|
|
Comprehensive, Integrative Genomic Analysis of Diffuse Lower-Grade Gliomas - Supplementary Appendix (see pages 23-24)
## adjusted-Rand index (clues): 0.0013
Result of ConsensuClusterPlus with default setup
(innerLinkage=“average”, finalLinkage=“average”, distance=“pearson”)
## adjusted-Rand index (clues): 0.0947
|
|
|
|
|
|
Result of ConsensuClusterPlus with clusterAlg = “pam” and distance = “euclidean” (other parameters left to default)
## adjusted-Rand index (clues): 0.0013
|
|
|
|
|
|
## adjusted-Rand index (clues): 0.4637
Result of ConsensuClusterPlus with default setup
(innerLinkage=“average”, finalLinkage=“average”, distance=“pearson”)
## adjusted-Rand index (clues): 0.5208
|
|
|
|
|
|
|
Result of ConsensuClusterPlus with clusterAlg = “pam” and distance = “euclidean” (other parameters left to default)
## adjusted-Rand index (clues): 1
|
|
|
|
|
|
|
## adjusted-Rand index (clues): 0.0013
Result of ConsensuClusterPlus with default setup
(innerLinkage=“average”, finalLinkage=“average”, distance=“pearson”)
## adjusted-Rand index (clues): 0.1812
|
|
|
|
|
|
|
Result of ConsensuClusterPlus with clusterAlg = “pam” and distance = “euclidean” (other parameters left to default)
## adjusted-Rand index (clues): 0.6112
|
|
|
|
|
|
|
| AWST | Hart | Radovich | TCGA | FPKM2500 | FPKM5K | |
|---|---|---|---|---|---|---|
| HC | 0.9832 | 0.6779 | 0.0848 | 0.0013 | 0.4637 | 0.0013 |
| CCP1 | 1.0000 | 0.6952 | 1.0000 | 0.0947 | 0.5208 | 0.1812 |
| CCP2 | 1.0000 | 1.0000 | 0.1212 | 0.0013 | 1.0000 | 0.6112 |
HC) hirerachical clustering with Ward’s likage and Euclidean distance; CPP1) ConsensusClusterPlus with average innner and outer linkage, and Pearson’s correlation as distance; CCP2) ConsensusClusterPlus with PAM and Euclidean distance;
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.1
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=it_IT.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=it_IT.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=it_IT.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=it_IT.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] EDASeq_2.10.0 ShortRead_1.34.2
## [3] GenomicAlignments_1.12.2 SummarizedExperiment_1.6.5
## [5] DelayedArray_0.2.7 matrixStats_0.56.0
## [7] Rsamtools_1.28.0 GenomicRanges_1.28.6
## [9] GenomeInfoDb_1.12.3 Biostrings_2.44.2
## [11] XVector_0.16.0 IRanges_2.10.5
## [13] S4Vectors_0.14.7 BiocParallel_1.10.1
## [15] Biobase_2.36.2 BiocGenerics_0.22.1
## [17] heatmap3_1.1.7 steFunctions_2019.04.29
## [19] awst_0.0.4 clues_0.6.2.2
## [21] dendextend_1.13.4 cluster_2.0.6
## [23] knitr_1.28
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 bit64_0.9-7 doParallel_1.0.15
## [4] RColorBrewer_1.1-2 tools_3.4.4 R6_2.4.1
## [7] DBI_1.1.0 colorspace_1.4-1 tidyselect_1.0.0
## [10] gridExtra_2.3 bit_1.1-15.2 compiler_3.4.4
## [13] rtracklayer_1.36.6 scales_1.1.0 genefilter_1.58.1
## [16] DESeq_1.28.0 stringr_1.4.0 digest_0.6.25
## [19] rmarkdown_2.1 R.utils_2.9.2 pkgconfig_2.0.3
## [22] htmltools_0.4.0 highr_0.8 rlang_0.4.5
## [25] RSQLite_2.2.0 hwriter_1.3.2 dplyr_0.8.5
## [28] R.oo_1.23.0 RCurl_1.98-1.1 magrittr_1.5
## [31] GenomeInfoDbData_0.99.0 Matrix_1.2-18 Rcpp_1.0.4
## [34] munsell_0.5.0 viridis_0.5.1 lifecycle_0.2.0
## [37] R.methodsS3_1.8.0 stringi_1.4.6 yaml_2.2.1
## [40] zlibbioc_1.22.0 grid_3.4.4 blob_1.2.1
## [43] crayon_1.3.4 lattice_0.20-35 splines_3.4.4
## [46] GenomicFeatures_1.28.5 annotate_1.54.0 pillar_1.4.3
## [49] fastcluster_1.1.25 geneplotter_1.54.0 codetools_0.2-15
## [52] biomaRt_2.32.1 XML_3.99-0.3 glue_1.3.2
## [55] evaluate_0.14 latticeExtra_0.6-28 vctrs_0.2.4
## [58] foreach_1.4.8 gtable_0.3.0 purrr_0.3.3
## [61] assertthat_0.2.1 ggplot2_3.3.0 xfun_0.12
## [64] aroma.light_3.6.0 xtable_1.8-4 survival_3.1-12
## [67] viridisLite_0.3.0 tibble_2.1.3 iterators_1.0.12
## [70] AnnotationDbi_1.38.2 memoise_1.1.0